缺失数据

# Create a 3x3 frame of random values labelled 'a', 'c', 'e', then reindex it
# onto five labels; rows 'b' and 'd' have no source data and become NaN.
df = pd.DataFrame(
    data=np.random.randn(3, 3),
    index=list('ace'),
    columns=['one', 'two', 'three'],
).reindex(list('abcde'))
print(df)

# Sample output (values are random and differ on every run):
#          one       two     three
#  a -2.160823  1.205748 -0.903059
#  b       NaN       NaN       NaN
#  c  0.982933 -1.107031  2.163404
#  d       NaN       NaN       NaN
#  e -0.351059  0.493755 -0.096658

1> 检查缺失值

isnull() 和 notnull() 函数

# Sample frame whose rows 'b' and 'd' are entirely NaN after reindexing.
df = pd.DataFrame(
    np.random.randn(3, 3),
    index=['a', 'c', 'e'],
    columns=['one', 'two', 'three'],
).reindex(['a', 'b', 'c', 'd', 'e'])

# isnull marks each missing entry of the column with True.
print(pd.isnull(df['one']))

# Output:
#  a    False
#  b     True
#  c    False
#  d     True
#  e    False
#  Name: one, dtype: bool

# notnull is the inverse: True wherever a value is present.
print(pd.notnull(df['two']))

# Output:
#  a     True
#  b    False
#  c     True
#  d    False
#  e     True
#  Name: two, dtype: bool

2> 缺失值的计算

在数据求和时,NaN 会被跳过(相当于视为 0);如果数据全部是 NaN,在 pandas 0.22 及之后的版本中结果是 0(传入 min_count=1 时结果才是 NaN)

# Sample frame whose rows 'b' and 'd' are entirely NaN after reindexing.
df = pd.DataFrame(
    np.random.randn(3, 3),
    index=['a', 'c', 'e'],
    columns=['one', 'two', 'three'],
)
df = df.reindex(['a', 'b', 'c', 'd', 'e'])

# Put the frame on its own line so the column header lines up with the rows
# (without the newline the header is pushed right by the label text).
original_report = f'原数组:\n{df}'
print(original_report)

# Sample output (values are random and differ on every run):
#  原数组:
#          one       two     three
#  a  0.024880 -0.615047  0.335393
#  b       NaN       NaN       NaN
#  c  0.630599 -1.647667 -0.260803
#  d       NaN       NaN       NaN
#  e  0.845244  0.391965 -2.285933

# Summing a Series skips its NaN entries, whether the sum is requested through
# NumPy or through the Series method, so both totals agree.
total_via_numpy = np.sum(df['two'])
print(f"数组求和1 : {total_via_numpy}")

# Output:
#  数组求和1 : -1.870749246793861
total_via_pandas = df['two'].sum()
print(f'数组求和2 : {total_via_pandas}')

# Output:
#  数组求和2 : -1.870749246793861

3> 缺失数据填充

fillna()函数

# Sample frame whose rows 'b' and 'd' are entirely NaN after reindexing.
df = pd.DataFrame(
    np.random.randn(3, 3),
    index=['a', 'c', 'e'],
    columns=['one', 'two', 'three'],
).reindex(['a', 'b', 'c', 'd', 'e'])
print(f"原数组:\n{df}")

# Sample output (values are random and differ on every run):
#  原数组:
#          one       two     three
#  a  1.474877  0.180455 -0.104257
#  b       NaN       NaN       NaN
#  c -0.724581  2.251094 -0.073617
#  d       NaN       NaN       NaN
#  e  1.950274  0.381642 -0.320262

# fillna returns a new frame with every NaN replaced by the given value;
# df itself keeps its NaN rows, which is why it can be filled again below.
print(f'用 0 填充 NaN 后的数组 :\n {df.fillna(value=0)}')

# Output:
#  用 0 填充 NaN 后的数组 :
#           one       two     three
#  a  1.474877  0.180455 -0.104257
#  b  0.000000  0.000000  0.000000
#  c -0.724581  2.251094 -0.073617
#  d  0.000000  0.000000  0.000000
#  e  1.950274  0.381642 -0.320262

print(f"用 3 填充 NaN 后的数组 :\n {df.fillna(value=3)}")

# Output:
#  用 3 填充 NaN 后的数组 :
#           one       two     three
#  a  1.474877  0.180455 -0.104257
#  b  3.000000  3.000000  3.000000
#  c -0.724581  2.251094 -0.073617
#  d  3.000000  3.000000  3.000000
#  e  1.950274  0.381642 -0.320262

4> 向前或向后填充

# Sample frame whose rows 'b' and 'd' are entirely NaN after reindexing.
df = pd.DataFrame(
    np.random.randn(3, 3),
    index=['a', 'c', 'e'],
    columns=['one', 'two', 'three'],
)
df = df.reindex(['a', 'b', 'c', 'd', 'e'])
print(f'原数组:\n{df}')

# Sample output (values are random and differ on every run):
#  原数组:
#          one       two     three
#  a -0.322989  1.459974  0.246409
#  b       NaN       NaN       NaN
#  c  1.428074  0.103128  0.813241
#  d       NaN       NaN       NaN
#  e  0.031593  1.016594  2.086443

# Forward fill: each NaN row takes the values of the nearest valid row above
# it. ffill() is used instead of fillna(method='pad'), which recent pandas
# releases deprecate.
forward_filled = df.ffill()
print(f"向前填充 NaN 后的数组 :\n {forward_filled}")

# Output:
#  向前填充 NaN 后的数组 :
#           one       two     three
#  a -0.322989  1.459974  0.246409
#  b -0.322989  1.459974  0.246409
#  c  1.428074  0.103128  0.813241
#  d  1.428074  0.103128  0.813241
#  e  0.031593  1.016594  2.086443

# Backward fill: each NaN row takes the values of the nearest valid row below
# it. bfill() is used instead of the deprecated fillna(method='backfill').
backward_filled = df.bfill()
print(f'向后填充 NaN 后的数组 :\n {backward_filled}')

# Output:
#  向后填充 NaN 后的数组 :
#           one       two     three
#  a -0.322989  1.459974  0.246409
#  b  1.428074  0.103128  0.813241
#  c  1.428074  0.103128  0.813241
#  d  0.031593  1.016594  2.086443
#  e  0.031593  1.016594  2.086443

5> 清除缺失值

在默认情况下,axis = 0,即在行上应用;

如果行内的任何值是 NaN,那么整行被排除;当 axis = 1 时则按列处理,含有 NaN 的列会被整列排除

# Sample frame whose rows 'b' and 'd' are entirely NaN after reindexing.
df = pd.DataFrame(
    np.random.randn(3, 3),
    index=['a', 'c', 'e'],
    columns=['one', 'two', 'three'],
).reindex(['a', 'b', 'c', 'd', 'e'])

# Row-wise drop (axis=0): rows 'b' and 'd' contain NaN and are removed.
print(df.dropna(axis=0))

# Sample output (values are random and differ on every run):
#          one       two     three
#  a -2.129920 -1.001321 -1.522123
#  c  0.182309  0.285505  1.195824
#  e -1.163222 -0.938291  0.151461

# Column-wise drop: every column has NaN in rows 'b' and 'd', so no column
# survives and only the index is left.
print(df.dropna(axis='columns'))

# Output:
#  Empty DataFrame
#  Columns: []
#  Index: [a, b, c, d, e]

6> 值替换

# Sample frame whose rows 'b' and 'd' are entirely NaN after reindexing.
df = pd.DataFrame(
    np.random.randn(3, 3),
    index=['a', 'c', 'e'],
    columns=['one', 'two', 'three'],
)
df = df.reindex(['a', 'b', 'c', 'd', 'e'])

# replace() with a {old: new} mapping swaps every NaN for 0.14. np.nan is used
# because the np.NaN alias was removed in NumPy 2.0 and raises AttributeError.
replaced = df.replace({np.nan: 0.14})
print(replaced)

# Sample output (values are random and differ on every run):
#          one       two     three
#  a  0.844849  0.308399  0.428219
#  b  0.140000  0.140000  0.140000
#  c -1.915478  0.713083 -0.365932
#  d  0.140000  0.140000  0.140000
#  e -0.291266 -1.076788  1.454755